import pandas as pd
import plotnine as p9
from plotnine import *
from plotnine.data import *
import numpy as np
Time Series Forecasting with Python: Method 1
Loading packages
Read data
= pd.read_csv('AirPassengers.csv')
airpassenger 'Month']= pd.to_datetime(airpassenger['Month']) airpassenger[
Visualise data
='Month', y='#Passengers'))+geom_line() ggplot(airpassenger, aes(x
Training set vs Test set
# Define training and test separation point
= '1960-01'
split_date
# Visualize data
= (
g ='Month', y='#Passengers')) +
ggplot(airpassenger, aes(x='blue') +
geom_line(color=pd.to_datetime(split_date), linetype='dashed', color='red') +
geom_vline(xintercept
labs(='Air Passenger Counts Over Time',
title='Month',
x='Number of Passengers'
y+
)
theme_minimal()
)
# Print plot
print(g)
Apply log transformation
'Log_Passengers'] = np.log(airpassenger['#Passengers'])
airpassenger[
# Define training and test separation point
= '1960-01-01'
split_date
# Visualize data
= (
g ='Month', y='Log_Passengers')) +
ggplot(airpassenger, aes(x='blue') +
geom_line(color=pd.to_datetime(split_date), linetype='dashed', color='red') +
geom_vline(xintercept
labs(='Log-Transformed Air Passenger Counts Over Time',
title='Month',
x='Log(Number of Passengers)'
y+
)
theme_minimal()
)
# Print plot
print(g)
Set training data set and test dataset
= airpassenger[airpassenger['Month'] < pd.to_datetime(split_date)]
training_data = airpassenger[airpassenger['Month'] >= pd.to_datetime(split_date)] test_data
Plot ACF and PACF for training data
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
=(12, 6))
plt.figure(figsize1, 2, 1)
plt.subplot('Log_Passengers'], lags=40, ax=plt.gca())
plot_acf(training_data['ACF of Log-Transformed Training Data')
plt.title(
1, 2, 2)
plt.subplot('Log_Passengers'], lags=40, ax=plt.gca(), method='ywm')
plot_pacf(training_data['PACF of Log-Transformed Training Data')
plt.title(
plt.tight_layout() plt.show()
Apply First-Order Seasonal Difference and Obtain ACF and PACF
'LogSeasonal_Diff'] = airpassenger['Log_Passengers'] - airpassenger['Log_Passengers'].shift(12)
airpassenger[14) airpassenger.head(
Month | #Passengers | Log_Passengers | LogSeasonal_Diff | |
---|---|---|---|---|
0 | 1949-01-01 | 112 | 4.718499 | NaN |
1 | 1949-02-01 | 118 | 4.770685 | NaN |
2 | 1949-03-01 | 132 | 4.882802 | NaN |
3 | 1949-04-01 | 129 | 4.859812 | NaN |
4 | 1949-05-01 | 121 | 4.795791 | NaN |
5 | 1949-06-01 | 135 | 4.905275 | NaN |
6 | 1949-07-01 | 148 | 4.997212 | NaN |
7 | 1949-08-01 | 148 | 4.997212 | NaN |
8 | 1949-09-01 | 136 | 4.912655 | NaN |
9 | 1949-10-01 | 119 | 4.779123 | NaN |
10 | 1949-11-01 | 104 | 4.644391 | NaN |
11 | 1949-12-01 | 118 | 4.770685 | NaN |
12 | 1950-01-01 | 115 | 4.744932 | 0.026433 |
13 | 1950-02-01 | 126 | 4.836282 | 0.065597 |
12) airpassenger.tail(
Month | #Passengers | Log_Passengers | LogSeasonal_Diff | |
---|---|---|---|---|
132 | 1960-01-01 | 417 | 6.033086 | 0.146982 |
133 | 1960-02-01 | 391 | 5.968708 | 0.133897 |
134 | 1960-03-01 | 419 | 6.037871 | 0.031518 |
135 | 1960-04-01 | 461 | 6.133398 | 0.151984 |
136 | 1960-05-01 | 472 | 6.156979 | 0.116724 |
137 | 1960-06-01 | 535 | 6.282267 | 0.125288 |
138 | 1960-07-01 | 622 | 6.432940 | 0.126665 |
139 | 1960-08-01 | 606 | 6.406880 | 0.080731 |
140 | 1960-09-01 | 508 | 6.230481 | 0.092754 |
141 | 1960-10-01 | 461 | 6.133398 | 0.124585 |
142 | 1960-11-01 | 390 | 5.966147 | 0.074503 |
143 | 1960-12-01 | 432 | 6.068426 | 0.064539 |
Remove rows with NaN values from the airpassenger DataFrame
=True)
airpassenger.dropna(inplace
# Display the cleaned DataFrame
print(airpassenger)
Month #Passengers Log_Passengers LogSeasonal_Diff
12 1950-01-01 115 4.744932 0.026433
13 1950-02-01 126 4.836282 0.065597
14 1950-03-01 141 4.948760 0.065958
15 1950-04-01 135 4.905275 0.045462
16 1950-05-01 125 4.828314 0.032523
.. ... ... ... ...
139 1960-08-01 606 6.406880 0.080731
140 1960-09-01 508 6.230481 0.092754
141 1960-10-01 461 6.133398 0.124585
142 1960-11-01 390 5.966147 0.074503
143 1960-12-01 432 6.068426 0.064539
[132 rows x 4 columns]
Plot ACF and PACF of the log-seasonally differenced series
= plt.subplots(1, 2, figsize=(16, 6))
fig, axes
# ACF plot
'LogSeasonal_Diff'], ax=axes[0], lags=40)
plot_acf(airpassenger[0].set_title('ACF of Log-Seasonally Differenced Series')
axes[
# PACF plot
'LogSeasonal_Diff'], ax=axes[1], lags=40)
plot_pacf(airpassenger[1].set_title('PACF of Log-Seasonally Differenced Series')
axes[
plt.tight_layout() plt.show()
Apply First-Order Differencing
'LogSeasonal_Diff.NonSeaDiff'] = airpassenger['LogSeasonal_Diff'] - airpassenger['LogSeasonal_Diff'].shift(1)
airpassenger[14)
airpassenger.head(=True) airpassenger.dropna(inplace
= plt.subplots(1, 2, figsize=(16, 6))
fig, axes
# ACF plot
'LogSeasonal_Diff.NonSeaDiff'], ax=axes[0], lags=40)
plot_acf(airpassenger[0].set_title('ACF of Log-Seasonally Differenced and Non Seasonally Differenced Series')
axes[
# PACF plot
'LogSeasonal_Diff.NonSeaDiff'], ax=axes[1], lags=40)
plot_pacf(airpassenger[1].set_title('PACF of Log-Seasonally Differenced and Non-Seasonally DIfferenced Series')
axes[
plt.tight_layout() plt.show()
Fit ARIMA(0,1,1)(0,1,1)[12] model
from statsmodels.tsa.arima.model import ARIMA
= ARIMA(airpassenger['Log_Passengers'], order=(0, 1, 1), seasonal_order=(0, 1, 1, 12))
model = model.fit()
fitted_model = fitted_model.forecast(steps=13)
forecast forecast
C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:473: ValueWarning: An unsupported index was provided and will be ignored when e.g. forecasting.
C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: ValueWarning: No supported index is available. Prediction results will be given with an integer index beginning at `start`.
C:\Users\DELL\AppData\Local\Programs\Python\Python312\Lib\site-packages\statsmodels\tsa\base\tsa_model.py:836: FutureWarning: No supported index is available. In the next version, calling this method in a model without a supported index will result in an exception.
131 6.110396
132 6.053819
133 6.171650
134 6.199400
135 6.232725
136 6.368919
137 6.507530
138 6.503117
139 6.324834
140 6.209196
141 6.063624
142 6.168173
143 6.206753
Name: predicted_mean, dtype: float64
Residual Analysis
import seaborn as sns
# Compute residuals
= fitted_model.fittedvalues
train_forecast = airpassenger['Log_Passengers'] - train_forecast
residuals
# Plot residuals
=(10, 6))
plt.figure(figsize'Month'], residuals, label='Residuals', color='purple')
plt.plot(airpassenger[0, color='black', linewidth=1)
plt.axhline('Residuals from ARIMA Model')
plt.title('Month')
plt.xlabel('Residuals')
plt.ylabel(
plt.legend()True)
plt.grid(
plt.show()
# Plot the ACF and PACF of residuals to check for autocorrelation
=(12, 6))
plt.figure(figsize
1, 2, 1)
plt.subplot(=20, ax=plt.gca())
plot_acf(residuals, lags'ACF of Residuals')
plt.title(
1, 2, 2)
plt.subplot(=20, ax=plt.gca())
plot_pacf(residuals, lags'PACF of Residuals')
plt.title(
plt.tight_layout()
plt.show()
# Histogram of residuals to check for normality
=(10, 6))
plt.figure(figsize=True, color='purple', bins=20)
sns.histplot(residuals, kde'Histogram of Residuals')
plt.title('Residuals')
plt.xlabel('Frequency')
plt.ylabel( plt.show()
Back-transform forecasted log values to original scale
= np.exp(forecast)
forecast_original 'forecast_log'] = forecast
test_data['forecast_passengers'] = forecast_original
test_data[ test_data
C:\Users\DELL\AppData\Local\Temp\ipykernel_9236\3410330437.py:2: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
C:\Users\DELL\AppData\Local\Temp\ipykernel_9236\3410330437.py:3: SettingWithCopyWarning:
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
Month | #Passengers | Log_Passengers | forecast_log | forecast_passengers | |
---|---|---|---|---|---|
132 | 1960-01-01 | 417 | 6.033086 | 6.053819 | 425.735873 |
133 | 1960-02-01 | 391 | 5.968708 | 6.171650 | 478.975744 |
134 | 1960-03-01 | 419 | 6.037871 | 6.199400 | 492.453690 |
135 | 1960-04-01 | 461 | 6.133398 | 6.232725 | 509.141110 |
136 | 1960-05-01 | 472 | 6.156979 | 6.368919 | 583.426973 |
137 | 1960-06-01 | 535 | 6.282267 | 6.507530 | 670.168960 |
138 | 1960-07-01 | 622 | 6.432940 | 6.503117 | 667.218211 |
139 | 1960-08-01 | 606 | 6.406880 | 6.324834 | 558.265074 |
140 | 1960-09-01 | 508 | 6.230481 | 6.209196 | 497.301356 |
141 | 1960-10-01 | 461 | 6.133398 | 6.063624 | 429.930518 |
142 | 1960-11-01 | 390 | 5.966147 | 6.168173 | 477.313188 |
143 | 1960-12-01 | 432 | 6.068426 | 6.206753 | 496.087694 |
Compute MSE
from sklearn.metrics import mean_squared_error
= mean_squared_error(test_data['#Passengers'], test_data['forecast_passengers'])
mse
print(f'Mean Squared Error (MSE): {mse}')
Mean Squared Error (MSE): 5279.13204916217